{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IEMOCAP"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 01 extract transcript"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "from file_util import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target directory for every processed IEMOCAP file written by the cells below.\n",
    "# NOTE(review): create_folder comes from file_util; presumably it is a no-op when\n",
    "# the directory already exists -- TODO confirm.\n",
    "create_folder('../data/processed/IEMOCAP')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "256"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out_file = '../data/processed/IEMOCAP/processed_tran.csv'\n",
    "\n",
    "# extract_trans() appends to out_file, so remove any copy left by an earlier run.\n",
    "# os.remove is portable and, unlike shelling out to `rm`, does not report a\n",
    "# failure (exit status 256) when the file simply does not exist yet.\n",
    "if os.path.exists(out_file):\n",
    "    os.remove(out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_trans( list_in_file, out_file ) :\n",
    "    \"\"\"Append one [ID, transcription] CSV row per usable transcript line.\n",
    "\n",
    "    For every line, the ID is the first space-delimited token found before\n",
    "    the first ':' and the transcription is everything after that first ':'.\n",
    "    Lines are sorted within each input file before being written.\n",
    "\n",
    "    Args:\n",
    "        list_in_file: iterable of transcript file paths, processed in order.\n",
    "        out_file: path of the CSV file; it is opened in append mode, so rows\n",
    "            written earlier are kept.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: if a line whose ID starts with 'Ses' contains no ':'.\n",
    "    \"\"\"\n",
    "    for in_file in list_in_file:\n",
    "\n",
    "        with open(in_file, 'r') as f:\n",
    "            lines = sorted(f.readlines())          # sort based on first element\n",
    "\n",
    "        with open(out_file, 'a') as f:\n",
    "            csv_writer = csv.writer( f )\n",
    "\n",
    "            for line in lines:\n",
    "                # Split on the FIRST ':' only -- the spoken text may itself\n",
    "                # contain ':' (e.g. a clock time) and must not be cut there.\n",
    "                fields = line.split(':', 1)\n",
    "                name = fields[0].split(' ')[0].strip()\n",
    "\n",
    "                # unwanted cases\n",
    "                if name[:3] != 'Ses':             # noise transcription such as reply  M: sorry\n",
    "                    continue\n",
    "                if name[-3:-1] == 'XX':           # we don't have matching pair in label\n",
    "                    continue\n",
    "\n",
    "                csv_writer.writerow([name, fields[1].strip()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Session1, #sum files: 28\n",
      "Session2, #sum files: 58\n",
      "Session3, #sum files: 90\n",
      "Session4, #sum files: 120\n",
      "Session5, #sum files: 151\n"
     ]
    }
   ],
   "source": [
    "# [schema] ID, transcriptions [csv]\n",
    "\n",
    "# Gather the transcript files of Session1..Session5, then write them to one CSV.\n",
    "list_files = []\n",
    "\n",
    "for sess_idx in range(1, 6):\n",
    "    sess_name = 'Session' + str(sess_idx)\n",
    "\n",
    "    path = '../data/raw/IEMOCAP_full_release/' + sess_name + '/dialog/transcriptions/'\n",
    "    # file_search (from file_util) presumably adds its matches to list_files in\n",
    "    # place -- the running totals printed below grow with every session.\n",
    "    file_search(path, list_files)\n",
    "    list_files = sorted(list_files)\n",
    "\n",
    "    # single-argument print(...) prints the same text on Python 2 and Python 3\n",
    "    print(sess_name + \", #sum files: \" + str(len(list_files)))\n",
    "\n",
    "extract_trans(list_files, out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 02 extract label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "from file_util import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "256"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out_file = '../data/processed/IEMOCAP/label.csv'\n",
    "\n",
    "# extract_labels() appends to out_file, so remove any copy left by an earlier run.\n",
    "# os.remove is portable and, unlike shelling out to `rm`, does not report a\n",
    "# failure (exit status 256) when the file simply does not exist yet.\n",
    "if os.path.exists(out_file):\n",
    "    os.remove(out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Emotion category codes accepted in the label files.\n",
    "list_category = [\n",
    "                'ang',\n",
    "                'hap',\n",
    "                'sad',\n",
    "                'neu',\n",
    "                'fru',\n",
    "                'exc',\n",
    "                'fea',\n",
    "                'sur',\n",
    "                'dis',\n",
    "                'oth',\n",
    "                'xxx'\n",
    "                ]\n",
    "\n",
    "# Map every category code to a dense integer index, in order of first appearance\n",
    "# (a repeated code keeps the index it got the first time).\n",
    "category = {}\n",
    "for c_type in list_category:\n",
    "    if c_type not in category:\n",
    "        category[c_type] = len(category)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_category(lines):\n",
    "    \"\"\"Collect the [ID, category] pair from the first line of every record.\n",
    "\n",
    "    The input is treated as records separated by a line that is just a\n",
    "    newline. The first line of a record is split on tabs: field 1 is taken\n",
    "    as the ID and field 2 as the category. All other lines of the record\n",
    "    are ignored.\n",
    "\n",
    "    Args:\n",
    "        lines: list of text lines (header lines already removed by the caller).\n",
    "\n",
    "    Returns:\n",
    "        List of [ID, category] pairs in input order.\n",
    "\n",
    "    Exits via sys.exit() after printing an 'ERROR ...' message when a record\n",
    "    line has fewer than three tab-separated fields, or when its category is\n",
    "    not a key of the module-level `category` dict.\n",
    "    \"\"\"\n",
    "    is_target = True\n",
    "    list_ret = []\n",
    "\n",
    "    for line in lines:\n",
    "\n",
    "        if is_target:\n",
    "            fields = line.split('\\t')\n",
    "\n",
    "            # Only a too-short line is a format error. A bare `except:` here would\n",
    "            # also swallow SystemExit / NameError and blame the input line.\n",
    "            try:\n",
    "                utt_id  = fields[1].strip()  #  extract ID\n",
    "                c_label = fields[2].strip()  #  extract category\n",
    "            except IndexError:\n",
    "                print(\"ERROR \" + line)\n",
    "                sys.exit()\n",
    "\n",
    "            if c_label not in category:\n",
    "                print(\"ERROR nokey\" + c_label)\n",
    "                sys.exit()\n",
    "\n",
    "            list_ret.append( [utt_id, c_label] )\n",
    "            is_target = False\n",
    "\n",
    "        elif line == '\\n':\n",
    "            # blank separator: the next line starts a new record\n",
    "            is_target = True\n",
    "\n",
    "    return list_ret"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_labels( list_in_file, out_file ) :\n",
    "    \"\"\"Append the sorted [ID, category] rows of every evaluation file to a CSV.\n",
    "\n",
    "    Args:\n",
    "        list_in_file: iterable of evaluation file paths, processed in order.\n",
    "        out_file: CSV path, opened in append mode once per input file.\n",
    "    \"\"\"\n",
    "    for eval_path in list_in_file:\n",
    "\n",
    "        with open(eval_path, 'r') as src:\n",
    "            body = src.readlines()[2:]              # remove head\n",
    "            pairs = find_category(body)\n",
    "\n",
    "        pairs = sorted(pairs)                       # sort based on first element\n",
    "\n",
    "        with open(out_file, 'a') as dst:\n",
    "            csv.writer( dst ).writerows( pairs )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Session1, #sum files: 28\n",
      "Session2, #sum files: 58\n",
      "Session3, #sum files: 90\n",
      "Session4, #sum files: 120\n",
      "Session5, #sum files: 151\n"
     ]
    }
   ],
   "source": [
    "# [schema] ID, label [csv]\n",
    "\n",
    "# Gather the evaluation files of Session1..Session5, then write them to one CSV.\n",
    "list_files = []\n",
    "list_avoid_dir = ['Attribute', 'Categorical', 'Self-evaluation']\n",
    "\n",
    "for sess_idx in range(1, 6):\n",
    "    sess_name = 'Session' + str(sess_idx)\n",
    "\n",
    "    path = '../data/raw/IEMOCAP_full_release/' + sess_name + '/dialog/EmoEvaluation/'\n",
    "    # file_search (from file_util) presumably adds its matches to list_files in\n",
    "    # place and skips the directories named in list_avoid_dir -- TODO confirm.\n",
    "    file_search(path, list_files, list_avoid_dir)\n",
    "    list_files = sorted(list_files)\n",
    "\n",
    "    # single-argument print(...) prints the same text on Python 2 and Python 3\n",
    "    print(sess_name + \", #sum files: \" + str(len(list_files)))\n",
    "\n",
    "extract_labels(list_files, out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 02-1 convert label category\n",
    "- angry : 0        ang(1103)\n",
    "- happy : 1       exc(1041), hap(595)\n",
    "- sad : 2          sad(1084)   \n",
    "- neutral : 3     neu(1708)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every [ID, category] row of the label file into memory.\n",
    "lines = []\n",
    "with open('../data/processed/IEMOCAP/label.csv') as f :\n",
    "    lines = list(csv.reader(f))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Category code -> class id of the 4-class task ('exc' shares class 1 with 'hap').\n",
    "label_to_class = {'ang': '0', 'hap': '1', 'exc': '1', 'sad': '2', 'neu': '3'}\n",
    "\n",
    "with open('../data/processed/IEMOCAP/processed_label.txt', 'w') as f:\n",
    "\n",
    "    with open('../data/processed/IEMOCAP/processed_ids.txt', 'w') as f2:\n",
    "\n",
    "        for line in lines:\n",
    "            class_id = label_to_class.get(line[1])\n",
    "\n",
    "            if class_id is None:\n",
    "                # Any other category: '-1' goes to the label file only, so\n",
    "                # processed_ids.txt gets no line for this row.\n",
    "                f.write('-1\\n')\n",
    "            else:\n",
    "                f.write(class_id + '\\n')\n",
    "                f2.write(line[0]+'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1103\n",
      "1636\n",
      "1084\n",
      "1708\n"
     ]
    }
   ],
   "source": [
    "with open('../data/processed/IEMOCAP/processed_label.txt') as f :\n",
    "    lines = [x.strip() for x in f]\n",
    "\n",
    "# Number of rows per class id, one count per output line, in the order 0, 1, 2, 3.\n",
    "# Rows marked '-1' are not counted. print(...) with a single argument prints\n",
    "# the same text on Python 2 and Python 3.\n",
    "for class_id in ('0', '1', '2', '3'):\n",
    "    print(lines.count(class_id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 03 verify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "in_file = '../data/processed/IEMOCAP/processed_label.txt'\n",
    "label = []\n",
    "with open( in_file, 'r') as f:\n",
    "    label = f.readlines()\n",
    "\n",
    "# NOTE(review): these values are class ids ('0'..'3' or '-1'), not utterance IDs,\n",
    "# and label_id is reassigned from label.csv in the next code cell before it is used.\n",
    "label_id = [ row.strip() for row in label ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Utterance IDs (first CSV column) in the order they appear in label.csv.\n",
    "in_file = '../data/processed/IEMOCAP/label.csv'\n",
    "label = []\n",
    "with open( in_file, 'r') as f:\n",
    "    label = list(csv.reader( f ))\n",
    "\n",
    "label_id = [ row[0] for row in label ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Utterance IDs (first CSV column) in the order they appear in processed_tran.csv.\n",
    "in_file = '../data/processed/IEMOCAP/processed_tran.csv'\n",
    "tran = []\n",
    "with open( in_file, 'r') as f:\n",
    "    tran = list(csv.reader( f ))\n",
    "\n",
    "tran_id = [ row[0] for row in tran ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labels and transcripts must list the same utterance IDs in the same order.\n",
    "# zip() stops at the shorter list, so a length mismatch has to be checked\n",
    "# explicitly or missing trailing rows would go unnoticed.\n",
    "if len(label_id) != len(tran_id):\n",
    "    print('ERROR: %d label ids vs %d transcript ids' % (len(label_id), len(tran_id)))\n",
    "\n",
    "for row, (l, t) in enumerate(zip(label_id, tran_id)):\n",
    "    if l != t:\n",
    "        print('ERROR: row %d: label id %s != transcript id %s' % (row, l, t))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 04 statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Category codes (second CSV column) in the order they appear in label.csv.\n",
    "in_file = '../data/processed/IEMOCAP/label.csv'\n",
    "label = []\n",
    "with open( in_file, 'r') as f:\n",
    "    label = list(csv.reader( f ))\n",
    "\n",
    "label_cat = [ row[1] for row in label ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One counter per category, indexed by the integer stored in `category`.\n",
    "# Builtin int replaces np.int: that alias was deprecated in NumPy 1.20 and\n",
    "# removed in 1.24, and it always meant the builtin int.\n",
    "count = np.zeros( len(category), dtype=int )\n",
    "\n",
    "for cat in label_cat:\n",
    "    count[ category[cat] ] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "oth\t:3\n",
      "ang\t:1103\n",
      "exc\t:1041\n",
      "fea\t:40\n",
      "xxx\t:2507\n",
      "sad\t:1084\n",
      "sur\t:107\n",
      "fru\t:1849\n",
      "hap\t:595\n",
      "neu\t:1708\n",
      "dis\t:2\n"
     ]
    }
   ],
   "source": [
    "# One line per category: code, a tab and ':', then the count.\n",
    "# Single-argument print(...) prints the same text on Python 2 and Python 3.\n",
    "for key in category.keys():\n",
    "    print(key + '\\t:' + str(count[ category[key] ]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tf14_p27]",
   "language": "python",
   "name": "conda-env-tf14_p27-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
